{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    T5ForConditionalGeneration,\n",
    "    AutoConfig,\n",
    ")\n",
    "from pytorch_lightning import LightningModule\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('./out-large-1.1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/utils/hub.py:700: UserWarning: The `organization` argument is deprecated and will be removed in v5 of Transformers. Set your organization directly in the `repo_id` passed instead (`repo_id={organization}/{model_id}`).\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/nanot5-large-malaysian-cased/commit/acba1bc9853b27259ee65886db2dfc67c48574e1', commit_message='Upload tokenizer', commit_description='', oid='acba1bc9853b27259ee65886db2dfc67c48574e1', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The organization is part of the repo id; the separate `organization` kwarg is deprecated.\n",
    "tokenizer.push_to_hub('mesolitica/nanot5-large-malaysian-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Module(LightningModule):\n",
    "    \"\"\"Thin Lightning wrapper around the pretrained T5 model.\n",
    "\n",
    "    The network is stored on ``self.model``, so every ``state_dict`` key of\n",
    "    this module is prefixed with ``model.``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        pretrained_dir = './out-large-1.1'\n",
    "        t5_config = AutoConfig.from_pretrained(pretrained_dir)\n",
    "        self.model = T5ForConditionalGeneration.from_pretrained(\n",
    "            pretrained_dir, config=t5_config\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 18G\r\n",
      "-rwxrwxrwx 1 ubuntu ubuntu 8.8G Oct 26 02:34 'model-epoch=00-step=1000.ckpt'\r\n",
      "-rwxrwxrwx 1 ubuntu ubuntu 8.8G Oct 26 03:59 'model-epoch=00-step=2000.ckpt'\r\n"
     ]
    }
   ],
   "source": [
    "!ls -lh logs/large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Module()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "weights = model.state_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the training checkpoint onto the CPU and keep only the (key, value)\n",
    "# pairs stored under its 'state_dict' entry.\n",
    "old_weights = torch.load(\n",
    "    'logs/large/model-epoch=00-step=2000.ckpt',\n",
    "    map_location=torch.device('cpu'),\n",
    ")['state_dict'].items()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "model.shared.weight model.shared.weight\n",
      "model.encoder.embed_tokens.weight model.encoder.embed_tokens.weight\n",
      "model.encoder.block.0.layer.0.SelfAttention.q.weight model.encoder.block.0.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.0.layer.0.SelfAttention.k.weight model.encoder.block.0.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.0.layer.0.SelfAttention.v.weight model.encoder.block.0.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.0.layer.0.SelfAttention.o.weight model.encoder.block.0.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight model.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight\n",
      "model.encoder.block.0.layer.0.layer_norm.weight model.encoder.block.0.layer.0.layer_norm.weight\n",
      "model.encoder.block.0.layer.1.DenseReluDense.wi_0.weight model.encoder.block.0.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.0.layer.1.DenseReluDense.wi_1.weight model.encoder.block.0.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.0.layer.1.DenseReluDense.wo.weight model.encoder.block.0.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.0.layer.1.layer_norm.weight model.encoder.block.0.layer.1.layer_norm.weight\n",
      "model.encoder.block.1.layer.0.SelfAttention.q.weight model.encoder.block.1.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.1.layer.0.SelfAttention.k.weight model.encoder.block.1.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.1.layer.0.SelfAttention.v.weight model.encoder.block.1.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.1.layer.0.SelfAttention.o.weight model.encoder.block.1.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.1.layer.0.layer_norm.weight model.encoder.block.1.layer.0.layer_norm.weight\n",
      "model.encoder.block.1.layer.1.DenseReluDense.wi_0.weight model.encoder.block.1.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.1.layer.1.DenseReluDense.wi_1.weight model.encoder.block.1.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.1.layer.1.DenseReluDense.wo.weight model.encoder.block.1.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.1.layer.1.layer_norm.weight model.encoder.block.1.layer.1.layer_norm.weight\n",
      "model.encoder.block.2.layer.0.SelfAttention.q.weight model.encoder.block.2.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.2.layer.0.SelfAttention.k.weight model.encoder.block.2.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.2.layer.0.SelfAttention.v.weight model.encoder.block.2.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.2.layer.0.SelfAttention.o.weight model.encoder.block.2.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.2.layer.0.layer_norm.weight model.encoder.block.2.layer.0.layer_norm.weight\n",
      "model.encoder.block.2.layer.1.DenseReluDense.wi_0.weight model.encoder.block.2.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.2.layer.1.DenseReluDense.wi_1.weight model.encoder.block.2.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.2.layer.1.DenseReluDense.wo.weight model.encoder.block.2.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.2.layer.1.layer_norm.weight model.encoder.block.2.layer.1.layer_norm.weight\n",
      "model.encoder.block.3.layer.0.SelfAttention.q.weight model.encoder.block.3.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.3.layer.0.SelfAttention.k.weight model.encoder.block.3.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.3.layer.0.SelfAttention.v.weight model.encoder.block.3.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.3.layer.0.SelfAttention.o.weight model.encoder.block.3.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.3.layer.0.layer_norm.weight model.encoder.block.3.layer.0.layer_norm.weight\n",
      "model.encoder.block.3.layer.1.DenseReluDense.wi_0.weight model.encoder.block.3.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.3.layer.1.DenseReluDense.wi_1.weight model.encoder.block.3.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.3.layer.1.DenseReluDense.wo.weight model.encoder.block.3.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.3.layer.1.layer_norm.weight model.encoder.block.3.layer.1.layer_norm.weight\n",
      "model.encoder.block.4.layer.0.SelfAttention.q.weight model.encoder.block.4.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.4.layer.0.SelfAttention.k.weight model.encoder.block.4.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.4.layer.0.SelfAttention.v.weight model.encoder.block.4.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.4.layer.0.SelfAttention.o.weight model.encoder.block.4.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.4.layer.0.layer_norm.weight model.encoder.block.4.layer.0.layer_norm.weight\n",
      "model.encoder.block.4.layer.1.DenseReluDense.wi_0.weight model.encoder.block.4.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.4.layer.1.DenseReluDense.wi_1.weight model.encoder.block.4.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.4.layer.1.DenseReluDense.wo.weight model.encoder.block.4.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.4.layer.1.layer_norm.weight model.encoder.block.4.layer.1.layer_norm.weight\n",
      "model.encoder.block.5.layer.0.SelfAttention.q.weight model.encoder.block.5.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.5.layer.0.SelfAttention.k.weight model.encoder.block.5.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.5.layer.0.SelfAttention.v.weight model.encoder.block.5.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.5.layer.0.SelfAttention.o.weight model.encoder.block.5.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.5.layer.0.layer_norm.weight model.encoder.block.5.layer.0.layer_norm.weight\n",
      "model.encoder.block.5.layer.1.DenseReluDense.wi_0.weight model.encoder.block.5.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.5.layer.1.DenseReluDense.wi_1.weight model.encoder.block.5.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.5.layer.1.DenseReluDense.wo.weight model.encoder.block.5.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.5.layer.1.layer_norm.weight model.encoder.block.5.layer.1.layer_norm.weight\n",
      "model.encoder.block.6.layer.0.SelfAttention.q.weight model.encoder.block.6.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.6.layer.0.SelfAttention.k.weight model.encoder.block.6.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.6.layer.0.SelfAttention.v.weight model.encoder.block.6.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.6.layer.0.SelfAttention.o.weight model.encoder.block.6.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.6.layer.0.layer_norm.weight model.encoder.block.6.layer.0.layer_norm.weight\n",
      "model.encoder.block.6.layer.1.DenseReluDense.wi_0.weight model.encoder.block.6.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.6.layer.1.DenseReluDense.wi_1.weight model.encoder.block.6.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.6.layer.1.DenseReluDense.wo.weight model.encoder.block.6.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.6.layer.1.layer_norm.weight model.encoder.block.6.layer.1.layer_norm.weight\n",
      "model.encoder.block.7.layer.0.SelfAttention.q.weight model.encoder.block.7.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.7.layer.0.SelfAttention.k.weight model.encoder.block.7.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.7.layer.0.SelfAttention.v.weight model.encoder.block.7.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.7.layer.0.SelfAttention.o.weight model.encoder.block.7.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.7.layer.0.layer_norm.weight model.encoder.block.7.layer.0.layer_norm.weight\n",
      "model.encoder.block.7.layer.1.DenseReluDense.wi_0.weight model.encoder.block.7.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.7.layer.1.DenseReluDense.wi_1.weight model.encoder.block.7.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.7.layer.1.DenseReluDense.wo.weight model.encoder.block.7.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.7.layer.1.layer_norm.weight model.encoder.block.7.layer.1.layer_norm.weight\n",
      "model.encoder.block.8.layer.0.SelfAttention.q.weight model.encoder.block.8.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.8.layer.0.SelfAttention.k.weight model.encoder.block.8.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.8.layer.0.SelfAttention.v.weight model.encoder.block.8.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.8.layer.0.SelfAttention.o.weight model.encoder.block.8.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.8.layer.0.layer_norm.weight model.encoder.block.8.layer.0.layer_norm.weight\n",
      "model.encoder.block.8.layer.1.DenseReluDense.wi_0.weight model.encoder.block.8.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.8.layer.1.DenseReluDense.wi_1.weight model.encoder.block.8.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.8.layer.1.DenseReluDense.wo.weight model.encoder.block.8.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.8.layer.1.layer_norm.weight model.encoder.block.8.layer.1.layer_norm.weight\n",
      "model.encoder.block.9.layer.0.SelfAttention.q.weight model.encoder.block.9.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.9.layer.0.SelfAttention.k.weight model.encoder.block.9.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.9.layer.0.SelfAttention.v.weight model.encoder.block.9.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.9.layer.0.SelfAttention.o.weight model.encoder.block.9.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.9.layer.0.layer_norm.weight model.encoder.block.9.layer.0.layer_norm.weight\n",
      "model.encoder.block.9.layer.1.DenseReluDense.wi_0.weight model.encoder.block.9.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.9.layer.1.DenseReluDense.wi_1.weight model.encoder.block.9.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.9.layer.1.DenseReluDense.wo.weight model.encoder.block.9.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.9.layer.1.layer_norm.weight model.encoder.block.9.layer.1.layer_norm.weight\n",
      "model.encoder.block.10.layer.0.SelfAttention.q.weight model.encoder.block.10.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.10.layer.0.SelfAttention.k.weight model.encoder.block.10.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.10.layer.0.SelfAttention.v.weight model.encoder.block.10.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.10.layer.0.SelfAttention.o.weight model.encoder.block.10.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.10.layer.0.layer_norm.weight model.encoder.block.10.layer.0.layer_norm.weight\n",
      "model.encoder.block.10.layer.1.DenseReluDense.wi_0.weight model.encoder.block.10.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.10.layer.1.DenseReluDense.wi_1.weight model.encoder.block.10.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.10.layer.1.DenseReluDense.wo.weight model.encoder.block.10.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.10.layer.1.layer_norm.weight model.encoder.block.10.layer.1.layer_norm.weight\n",
      "model.encoder.block.11.layer.0.SelfAttention.q.weight model.encoder.block.11.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.11.layer.0.SelfAttention.k.weight model.encoder.block.11.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.11.layer.0.SelfAttention.v.weight model.encoder.block.11.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.11.layer.0.SelfAttention.o.weight model.encoder.block.11.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.11.layer.0.layer_norm.weight model.encoder.block.11.layer.0.layer_norm.weight\n",
      "model.encoder.block.11.layer.1.DenseReluDense.wi_0.weight model.encoder.block.11.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.11.layer.1.DenseReluDense.wi_1.weight model.encoder.block.11.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.11.layer.1.DenseReluDense.wo.weight model.encoder.block.11.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.11.layer.1.layer_norm.weight model.encoder.block.11.layer.1.layer_norm.weight\n",
      "model.encoder.block.12.layer.0.SelfAttention.q.weight model.encoder.block.12.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.12.layer.0.SelfAttention.k.weight model.encoder.block.12.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.12.layer.0.SelfAttention.v.weight model.encoder.block.12.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.12.layer.0.SelfAttention.o.weight model.encoder.block.12.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.12.layer.0.layer_norm.weight model.encoder.block.12.layer.0.layer_norm.weight\n",
      "model.encoder.block.12.layer.1.DenseReluDense.wi_0.weight model.encoder.block.12.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.12.layer.1.DenseReluDense.wi_1.weight model.encoder.block.12.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.12.layer.1.DenseReluDense.wo.weight model.encoder.block.12.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.12.layer.1.layer_norm.weight model.encoder.block.12.layer.1.layer_norm.weight\n",
      "model.encoder.block.13.layer.0.SelfAttention.q.weight model.encoder.block.13.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.13.layer.0.SelfAttention.k.weight model.encoder.block.13.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.13.layer.0.SelfAttention.v.weight model.encoder.block.13.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.13.layer.0.SelfAttention.o.weight model.encoder.block.13.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.13.layer.0.layer_norm.weight model.encoder.block.13.layer.0.layer_norm.weight\n",
      "model.encoder.block.13.layer.1.DenseReluDense.wi_0.weight model.encoder.block.13.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.13.layer.1.DenseReluDense.wi_1.weight model.encoder.block.13.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.13.layer.1.DenseReluDense.wo.weight model.encoder.block.13.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.13.layer.1.layer_norm.weight model.encoder.block.13.layer.1.layer_norm.weight\n",
      "model.encoder.block.14.layer.0.SelfAttention.q.weight model.encoder.block.14.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.14.layer.0.SelfAttention.k.weight model.encoder.block.14.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.14.layer.0.SelfAttention.v.weight model.encoder.block.14.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.14.layer.0.SelfAttention.o.weight model.encoder.block.14.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.14.layer.0.layer_norm.weight model.encoder.block.14.layer.0.layer_norm.weight\n",
      "model.encoder.block.14.layer.1.DenseReluDense.wi_0.weight model.encoder.block.14.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.14.layer.1.DenseReluDense.wi_1.weight model.encoder.block.14.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.14.layer.1.DenseReluDense.wo.weight model.encoder.block.14.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.14.layer.1.layer_norm.weight model.encoder.block.14.layer.1.layer_norm.weight\n",
      "model.encoder.block.15.layer.0.SelfAttention.q.weight model.encoder.block.15.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.15.layer.0.SelfAttention.k.weight model.encoder.block.15.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.15.layer.0.SelfAttention.v.weight model.encoder.block.15.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.15.layer.0.SelfAttention.o.weight model.encoder.block.15.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.15.layer.0.layer_norm.weight model.encoder.block.15.layer.0.layer_norm.weight\n",
      "model.encoder.block.15.layer.1.DenseReluDense.wi_0.weight model.encoder.block.15.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.15.layer.1.DenseReluDense.wi_1.weight model.encoder.block.15.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.15.layer.1.DenseReluDense.wo.weight model.encoder.block.15.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.15.layer.1.layer_norm.weight model.encoder.block.15.layer.1.layer_norm.weight\n",
      "model.encoder.block.16.layer.0.SelfAttention.q.weight model.encoder.block.16.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.16.layer.0.SelfAttention.k.weight model.encoder.block.16.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.16.layer.0.SelfAttention.v.weight model.encoder.block.16.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.16.layer.0.SelfAttention.o.weight model.encoder.block.16.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.16.layer.0.layer_norm.weight model.encoder.block.16.layer.0.layer_norm.weight\n",
      "model.encoder.block.16.layer.1.DenseReluDense.wi_0.weight model.encoder.block.16.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.16.layer.1.DenseReluDense.wi_1.weight model.encoder.block.16.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.16.layer.1.DenseReluDense.wo.weight model.encoder.block.16.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.16.layer.1.layer_norm.weight model.encoder.block.16.layer.1.layer_norm.weight\n",
      "model.encoder.block.17.layer.0.SelfAttention.q.weight model.encoder.block.17.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.17.layer.0.SelfAttention.k.weight model.encoder.block.17.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.17.layer.0.SelfAttention.v.weight model.encoder.block.17.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.17.layer.0.SelfAttention.o.weight model.encoder.block.17.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.17.layer.0.layer_norm.weight model.encoder.block.17.layer.0.layer_norm.weight\n",
      "model.encoder.block.17.layer.1.DenseReluDense.wi_0.weight model.encoder.block.17.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.17.layer.1.DenseReluDense.wi_1.weight model.encoder.block.17.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.17.layer.1.DenseReluDense.wo.weight model.encoder.block.17.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.17.layer.1.layer_norm.weight model.encoder.block.17.layer.1.layer_norm.weight\n",
      "model.encoder.block.18.layer.0.SelfAttention.q.weight model.encoder.block.18.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.18.layer.0.SelfAttention.k.weight model.encoder.block.18.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.18.layer.0.SelfAttention.v.weight model.encoder.block.18.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.18.layer.0.SelfAttention.o.weight model.encoder.block.18.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.18.layer.0.layer_norm.weight model.encoder.block.18.layer.0.layer_norm.weight\n",
      "model.encoder.block.18.layer.1.DenseReluDense.wi_0.weight model.encoder.block.18.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.18.layer.1.DenseReluDense.wi_1.weight model.encoder.block.18.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.18.layer.1.DenseReluDense.wo.weight model.encoder.block.18.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.18.layer.1.layer_norm.weight model.encoder.block.18.layer.1.layer_norm.weight\n",
      "model.encoder.block.19.layer.0.SelfAttention.q.weight model.encoder.block.19.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.19.layer.0.SelfAttention.k.weight model.encoder.block.19.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.19.layer.0.SelfAttention.v.weight model.encoder.block.19.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.19.layer.0.SelfAttention.o.weight model.encoder.block.19.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.19.layer.0.layer_norm.weight model.encoder.block.19.layer.0.layer_norm.weight\n",
      "model.encoder.block.19.layer.1.DenseReluDense.wi_0.weight model.encoder.block.19.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.19.layer.1.DenseReluDense.wi_1.weight model.encoder.block.19.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.19.layer.1.DenseReluDense.wo.weight model.encoder.block.19.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.19.layer.1.layer_norm.weight model.encoder.block.19.layer.1.layer_norm.weight\n",
      "model.encoder.block.20.layer.0.SelfAttention.q.weight model.encoder.block.20.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.20.layer.0.SelfAttention.k.weight model.encoder.block.20.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.20.layer.0.SelfAttention.v.weight model.encoder.block.20.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.20.layer.0.SelfAttention.o.weight model.encoder.block.20.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.20.layer.0.layer_norm.weight model.encoder.block.20.layer.0.layer_norm.weight\n",
      "model.encoder.block.20.layer.1.DenseReluDense.wi_0.weight model.encoder.block.20.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.20.layer.1.DenseReluDense.wi_1.weight model.encoder.block.20.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.20.layer.1.DenseReluDense.wo.weight model.encoder.block.20.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.20.layer.1.layer_norm.weight model.encoder.block.20.layer.1.layer_norm.weight\n",
      "model.encoder.block.21.layer.0.SelfAttention.q.weight model.encoder.block.21.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.21.layer.0.SelfAttention.k.weight model.encoder.block.21.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.21.layer.0.SelfAttention.v.weight model.encoder.block.21.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.21.layer.0.SelfAttention.o.weight model.encoder.block.21.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.21.layer.0.layer_norm.weight model.encoder.block.21.layer.0.layer_norm.weight\n",
      "model.encoder.block.21.layer.1.DenseReluDense.wi_0.weight model.encoder.block.21.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.21.layer.1.DenseReluDense.wi_1.weight model.encoder.block.21.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.21.layer.1.DenseReluDense.wo.weight model.encoder.block.21.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.21.layer.1.layer_norm.weight model.encoder.block.21.layer.1.layer_norm.weight\n",
      "model.encoder.block.22.layer.0.SelfAttention.q.weight model.encoder.block.22.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.22.layer.0.SelfAttention.k.weight model.encoder.block.22.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.22.layer.0.SelfAttention.v.weight model.encoder.block.22.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.22.layer.0.SelfAttention.o.weight model.encoder.block.22.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.22.layer.0.layer_norm.weight model.encoder.block.22.layer.0.layer_norm.weight\n",
      "model.encoder.block.22.layer.1.DenseReluDense.wi_0.weight model.encoder.block.22.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.22.layer.1.DenseReluDense.wi_1.weight model.encoder.block.22.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.22.layer.1.DenseReluDense.wo.weight model.encoder.block.22.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.22.layer.1.layer_norm.weight model.encoder.block.22.layer.1.layer_norm.weight\n",
      "model.encoder.block.23.layer.0.SelfAttention.q.weight model.encoder.block.23.layer.0.SelfAttention.q.weight\n",
      "model.encoder.block.23.layer.0.SelfAttention.k.weight model.encoder.block.23.layer.0.SelfAttention.k.weight\n",
      "model.encoder.block.23.layer.0.SelfAttention.v.weight model.encoder.block.23.layer.0.SelfAttention.v.weight\n",
      "model.encoder.block.23.layer.0.SelfAttention.o.weight model.encoder.block.23.layer.0.SelfAttention.o.weight\n",
      "model.encoder.block.23.layer.0.layer_norm.weight model.encoder.block.23.layer.0.layer_norm.weight\n",
      "model.encoder.block.23.layer.1.DenseReluDense.wi_0.weight model.encoder.block.23.layer.1.DenseReluDense.wi_0.weight\n",
      "model.encoder.block.23.layer.1.DenseReluDense.wi_1.weight model.encoder.block.23.layer.1.DenseReluDense.wi_1.weight\n",
      "model.encoder.block.23.layer.1.DenseReluDense.wo.weight model.encoder.block.23.layer.1.DenseReluDense.wo.weight\n",
      "model.encoder.block.23.layer.1.layer_norm.weight model.encoder.block.23.layer.1.layer_norm.weight\n",
      "model.encoder.final_layer_norm.weight model.encoder.final_layer_norm.weight\n",
      "model.decoder.embed_tokens.weight model.decoder.embed_tokens.weight\n",
      "model.decoder.block.0.layer.0.SelfAttention.q.weight model.decoder.block.0.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.0.layer.0.SelfAttention.k.weight model.decoder.block.0.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.0.layer.0.SelfAttention.v.weight model.decoder.block.0.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.0.layer.0.SelfAttention.o.weight model.decoder.block.0.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight model.decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight\n",
      "model.decoder.block.0.layer.0.layer_norm.weight model.decoder.block.0.layer.0.layer_norm.weight\n",
      "model.decoder.block.0.layer.1.EncDecAttention.q.weight model.decoder.block.0.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.0.layer.1.EncDecAttention.k.weight model.decoder.block.0.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.0.layer.1.EncDecAttention.v.weight model.decoder.block.0.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.0.layer.1.EncDecAttention.o.weight model.decoder.block.0.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.0.layer.1.layer_norm.weight model.decoder.block.0.layer.1.layer_norm.weight\n",
      "model.decoder.block.0.layer.2.DenseReluDense.wi_0.weight model.decoder.block.0.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.0.layer.2.DenseReluDense.wi_1.weight model.decoder.block.0.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.0.layer.2.DenseReluDense.wo.weight model.decoder.block.0.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.0.layer.2.layer_norm.weight model.decoder.block.0.layer.2.layer_norm.weight\n",
      "model.decoder.block.1.layer.0.SelfAttention.q.weight model.decoder.block.1.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.1.layer.0.SelfAttention.k.weight model.decoder.block.1.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.1.layer.0.SelfAttention.v.weight model.decoder.block.1.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.1.layer.0.SelfAttention.o.weight model.decoder.block.1.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.1.layer.0.layer_norm.weight model.decoder.block.1.layer.0.layer_norm.weight\n",
      "model.decoder.block.1.layer.1.EncDecAttention.q.weight model.decoder.block.1.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.1.layer.1.EncDecAttention.k.weight model.decoder.block.1.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.1.layer.1.EncDecAttention.v.weight model.decoder.block.1.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.1.layer.1.EncDecAttention.o.weight model.decoder.block.1.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.1.layer.1.layer_norm.weight model.decoder.block.1.layer.1.layer_norm.weight\n",
      "model.decoder.block.1.layer.2.DenseReluDense.wi_0.weight model.decoder.block.1.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.1.layer.2.DenseReluDense.wi_1.weight model.decoder.block.1.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.1.layer.2.DenseReluDense.wo.weight model.decoder.block.1.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.1.layer.2.layer_norm.weight model.decoder.block.1.layer.2.layer_norm.weight\n",
      "model.decoder.block.2.layer.0.SelfAttention.q.weight model.decoder.block.2.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.2.layer.0.SelfAttention.k.weight model.decoder.block.2.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.2.layer.0.SelfAttention.v.weight model.decoder.block.2.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.2.layer.0.SelfAttention.o.weight model.decoder.block.2.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.2.layer.0.layer_norm.weight model.decoder.block.2.layer.0.layer_norm.weight\n",
      "model.decoder.block.2.layer.1.EncDecAttention.q.weight model.decoder.block.2.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.2.layer.1.EncDecAttention.k.weight model.decoder.block.2.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.2.layer.1.EncDecAttention.v.weight model.decoder.block.2.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.2.layer.1.EncDecAttention.o.weight model.decoder.block.2.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.2.layer.1.layer_norm.weight model.decoder.block.2.layer.1.layer_norm.weight\n",
      "model.decoder.block.2.layer.2.DenseReluDense.wi_0.weight model.decoder.block.2.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.2.layer.2.DenseReluDense.wi_1.weight model.decoder.block.2.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.2.layer.2.DenseReluDense.wo.weight model.decoder.block.2.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.2.layer.2.layer_norm.weight model.decoder.block.2.layer.2.layer_norm.weight\n",
      "model.decoder.block.3.layer.0.SelfAttention.q.weight model.decoder.block.3.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.3.layer.0.SelfAttention.k.weight model.decoder.block.3.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.3.layer.0.SelfAttention.v.weight model.decoder.block.3.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.3.layer.0.SelfAttention.o.weight model.decoder.block.3.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.3.layer.0.layer_norm.weight model.decoder.block.3.layer.0.layer_norm.weight\n",
      "model.decoder.block.3.layer.1.EncDecAttention.q.weight model.decoder.block.3.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.3.layer.1.EncDecAttention.k.weight model.decoder.block.3.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.3.layer.1.EncDecAttention.v.weight model.decoder.block.3.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.3.layer.1.EncDecAttention.o.weight model.decoder.block.3.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.3.layer.1.layer_norm.weight model.decoder.block.3.layer.1.layer_norm.weight\n",
      "model.decoder.block.3.layer.2.DenseReluDense.wi_0.weight model.decoder.block.3.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.3.layer.2.DenseReluDense.wi_1.weight model.decoder.block.3.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.3.layer.2.DenseReluDense.wo.weight model.decoder.block.3.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.3.layer.2.layer_norm.weight model.decoder.block.3.layer.2.layer_norm.weight\n",
      "model.decoder.block.4.layer.0.SelfAttention.q.weight model.decoder.block.4.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.4.layer.0.SelfAttention.k.weight model.decoder.block.4.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.4.layer.0.SelfAttention.v.weight model.decoder.block.4.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.4.layer.0.SelfAttention.o.weight model.decoder.block.4.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.4.layer.0.layer_norm.weight model.decoder.block.4.layer.0.layer_norm.weight\n",
      "model.decoder.block.4.layer.1.EncDecAttention.q.weight model.decoder.block.4.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.4.layer.1.EncDecAttention.k.weight model.decoder.block.4.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.4.layer.1.EncDecAttention.v.weight model.decoder.block.4.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.4.layer.1.EncDecAttention.o.weight model.decoder.block.4.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.4.layer.1.layer_norm.weight model.decoder.block.4.layer.1.layer_norm.weight\n",
      "model.decoder.block.4.layer.2.DenseReluDense.wi_0.weight model.decoder.block.4.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.4.layer.2.DenseReluDense.wi_1.weight model.decoder.block.4.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.4.layer.2.DenseReluDense.wo.weight model.decoder.block.4.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.4.layer.2.layer_norm.weight model.decoder.block.4.layer.2.layer_norm.weight\n",
      "model.decoder.block.5.layer.0.SelfAttention.q.weight model.decoder.block.5.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.5.layer.0.SelfAttention.k.weight model.decoder.block.5.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.5.layer.0.SelfAttention.v.weight model.decoder.block.5.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.5.layer.0.SelfAttention.o.weight model.decoder.block.5.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.5.layer.0.layer_norm.weight model.decoder.block.5.layer.0.layer_norm.weight\n",
      "model.decoder.block.5.layer.1.EncDecAttention.q.weight model.decoder.block.5.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.5.layer.1.EncDecAttention.k.weight model.decoder.block.5.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.5.layer.1.EncDecAttention.v.weight model.decoder.block.5.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.5.layer.1.EncDecAttention.o.weight model.decoder.block.5.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.5.layer.1.layer_norm.weight model.decoder.block.5.layer.1.layer_norm.weight\n",
      "model.decoder.block.5.layer.2.DenseReluDense.wi_0.weight model.decoder.block.5.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.5.layer.2.DenseReluDense.wi_1.weight model.decoder.block.5.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.5.layer.2.DenseReluDense.wo.weight model.decoder.block.5.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.5.layer.2.layer_norm.weight model.decoder.block.5.layer.2.layer_norm.weight\n",
      "model.decoder.block.6.layer.0.SelfAttention.q.weight model.decoder.block.6.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.6.layer.0.SelfAttention.k.weight model.decoder.block.6.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.6.layer.0.SelfAttention.v.weight model.decoder.block.6.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.6.layer.0.SelfAttention.o.weight model.decoder.block.6.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.6.layer.0.layer_norm.weight model.decoder.block.6.layer.0.layer_norm.weight\n",
      "model.decoder.block.6.layer.1.EncDecAttention.q.weight model.decoder.block.6.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.6.layer.1.EncDecAttention.k.weight model.decoder.block.6.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.6.layer.1.EncDecAttention.v.weight model.decoder.block.6.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.6.layer.1.EncDecAttention.o.weight model.decoder.block.6.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.6.layer.1.layer_norm.weight model.decoder.block.6.layer.1.layer_norm.weight\n",
      "model.decoder.block.6.layer.2.DenseReluDense.wi_0.weight model.decoder.block.6.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.6.layer.2.DenseReluDense.wi_1.weight model.decoder.block.6.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.6.layer.2.DenseReluDense.wo.weight model.decoder.block.6.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.6.layer.2.layer_norm.weight model.decoder.block.6.layer.2.layer_norm.weight\n",
      "model.decoder.block.7.layer.0.SelfAttention.q.weight model.decoder.block.7.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.7.layer.0.SelfAttention.k.weight model.decoder.block.7.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.7.layer.0.SelfAttention.v.weight model.decoder.block.7.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.7.layer.0.SelfAttention.o.weight model.decoder.block.7.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.7.layer.0.layer_norm.weight model.decoder.block.7.layer.0.layer_norm.weight\n",
      "model.decoder.block.7.layer.1.EncDecAttention.q.weight model.decoder.block.7.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.7.layer.1.EncDecAttention.k.weight model.decoder.block.7.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.7.layer.1.EncDecAttention.v.weight model.decoder.block.7.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.7.layer.1.EncDecAttention.o.weight model.decoder.block.7.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.7.layer.1.layer_norm.weight model.decoder.block.7.layer.1.layer_norm.weight\n",
      "model.decoder.block.7.layer.2.DenseReluDense.wi_0.weight model.decoder.block.7.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.7.layer.2.DenseReluDense.wi_1.weight model.decoder.block.7.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.7.layer.2.DenseReluDense.wo.weight model.decoder.block.7.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.7.layer.2.layer_norm.weight model.decoder.block.7.layer.2.layer_norm.weight\n",
      "model.decoder.block.8.layer.0.SelfAttention.q.weight model.decoder.block.8.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.8.layer.0.SelfAttention.k.weight model.decoder.block.8.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.8.layer.0.SelfAttention.v.weight model.decoder.block.8.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.8.layer.0.SelfAttention.o.weight model.decoder.block.8.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.8.layer.0.layer_norm.weight model.decoder.block.8.layer.0.layer_norm.weight\n",
      "model.decoder.block.8.layer.1.EncDecAttention.q.weight model.decoder.block.8.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.8.layer.1.EncDecAttention.k.weight model.decoder.block.8.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.8.layer.1.EncDecAttention.v.weight model.decoder.block.8.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.8.layer.1.EncDecAttention.o.weight model.decoder.block.8.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.8.layer.1.layer_norm.weight model.decoder.block.8.layer.1.layer_norm.weight\n",
      "model.decoder.block.8.layer.2.DenseReluDense.wi_0.weight model.decoder.block.8.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.8.layer.2.DenseReluDense.wi_1.weight model.decoder.block.8.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.8.layer.2.DenseReluDense.wo.weight model.decoder.block.8.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.8.layer.2.layer_norm.weight model.decoder.block.8.layer.2.layer_norm.weight\n",
      "model.decoder.block.9.layer.0.SelfAttention.q.weight model.decoder.block.9.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.9.layer.0.SelfAttention.k.weight model.decoder.block.9.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.9.layer.0.SelfAttention.v.weight model.decoder.block.9.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.9.layer.0.SelfAttention.o.weight model.decoder.block.9.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.9.layer.0.layer_norm.weight model.decoder.block.9.layer.0.layer_norm.weight\n",
      "model.decoder.block.9.layer.1.EncDecAttention.q.weight model.decoder.block.9.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.9.layer.1.EncDecAttention.k.weight model.decoder.block.9.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.9.layer.1.EncDecAttention.v.weight model.decoder.block.9.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.9.layer.1.EncDecAttention.o.weight model.decoder.block.9.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.9.layer.1.layer_norm.weight model.decoder.block.9.layer.1.layer_norm.weight\n",
      "model.decoder.block.9.layer.2.DenseReluDense.wi_0.weight model.decoder.block.9.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.9.layer.2.DenseReluDense.wi_1.weight model.decoder.block.9.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.9.layer.2.DenseReluDense.wo.weight model.decoder.block.9.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.9.layer.2.layer_norm.weight model.decoder.block.9.layer.2.layer_norm.weight\n",
      "model.decoder.block.10.layer.0.SelfAttention.q.weight model.decoder.block.10.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.10.layer.0.SelfAttention.k.weight model.decoder.block.10.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.10.layer.0.SelfAttention.v.weight model.decoder.block.10.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.10.layer.0.SelfAttention.o.weight model.decoder.block.10.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.10.layer.0.layer_norm.weight model.decoder.block.10.layer.0.layer_norm.weight\n",
      "model.decoder.block.10.layer.1.EncDecAttention.q.weight model.decoder.block.10.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.10.layer.1.EncDecAttention.k.weight model.decoder.block.10.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.10.layer.1.EncDecAttention.v.weight model.decoder.block.10.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.10.layer.1.EncDecAttention.o.weight model.decoder.block.10.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.10.layer.1.layer_norm.weight model.decoder.block.10.layer.1.layer_norm.weight\n",
      "model.decoder.block.10.layer.2.DenseReluDense.wi_0.weight model.decoder.block.10.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.10.layer.2.DenseReluDense.wi_1.weight model.decoder.block.10.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.10.layer.2.DenseReluDense.wo.weight model.decoder.block.10.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.10.layer.2.layer_norm.weight model.decoder.block.10.layer.2.layer_norm.weight\n",
      "model.decoder.block.11.layer.0.SelfAttention.q.weight model.decoder.block.11.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.11.layer.0.SelfAttention.k.weight model.decoder.block.11.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.11.layer.0.SelfAttention.v.weight model.decoder.block.11.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.11.layer.0.SelfAttention.o.weight model.decoder.block.11.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.11.layer.0.layer_norm.weight model.decoder.block.11.layer.0.layer_norm.weight\n",
      "model.decoder.block.11.layer.1.EncDecAttention.q.weight model.decoder.block.11.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.11.layer.1.EncDecAttention.k.weight model.decoder.block.11.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.11.layer.1.EncDecAttention.v.weight model.decoder.block.11.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.11.layer.1.EncDecAttention.o.weight model.decoder.block.11.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.11.layer.1.layer_norm.weight model.decoder.block.11.layer.1.layer_norm.weight\n",
      "model.decoder.block.11.layer.2.DenseReluDense.wi_0.weight model.decoder.block.11.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.11.layer.2.DenseReluDense.wi_1.weight model.decoder.block.11.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.11.layer.2.DenseReluDense.wo.weight model.decoder.block.11.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.11.layer.2.layer_norm.weight model.decoder.block.11.layer.2.layer_norm.weight\n",
      "model.decoder.block.12.layer.0.SelfAttention.q.weight model.decoder.block.12.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.12.layer.0.SelfAttention.k.weight model.decoder.block.12.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.12.layer.0.SelfAttention.v.weight model.decoder.block.12.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.12.layer.0.SelfAttention.o.weight model.decoder.block.12.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.12.layer.0.layer_norm.weight model.decoder.block.12.layer.0.layer_norm.weight\n",
      "model.decoder.block.12.layer.1.EncDecAttention.q.weight model.decoder.block.12.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.12.layer.1.EncDecAttention.k.weight model.decoder.block.12.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.12.layer.1.EncDecAttention.v.weight model.decoder.block.12.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.12.layer.1.EncDecAttention.o.weight model.decoder.block.12.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.12.layer.1.layer_norm.weight model.decoder.block.12.layer.1.layer_norm.weight\n",
      "model.decoder.block.12.layer.2.DenseReluDense.wi_0.weight model.decoder.block.12.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.12.layer.2.DenseReluDense.wi_1.weight model.decoder.block.12.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.12.layer.2.DenseReluDense.wo.weight model.decoder.block.12.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.12.layer.2.layer_norm.weight model.decoder.block.12.layer.2.layer_norm.weight\n",
      "model.decoder.block.13.layer.0.SelfAttention.q.weight model.decoder.block.13.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.13.layer.0.SelfAttention.k.weight model.decoder.block.13.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.13.layer.0.SelfAttention.v.weight model.decoder.block.13.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.13.layer.0.SelfAttention.o.weight model.decoder.block.13.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.13.layer.0.layer_norm.weight model.decoder.block.13.layer.0.layer_norm.weight\n",
      "model.decoder.block.13.layer.1.EncDecAttention.q.weight model.decoder.block.13.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.13.layer.1.EncDecAttention.k.weight model.decoder.block.13.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.13.layer.1.EncDecAttention.v.weight model.decoder.block.13.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.13.layer.1.EncDecAttention.o.weight model.decoder.block.13.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.13.layer.1.layer_norm.weight model.decoder.block.13.layer.1.layer_norm.weight\n",
      "model.decoder.block.13.layer.2.DenseReluDense.wi_0.weight model.decoder.block.13.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.13.layer.2.DenseReluDense.wi_1.weight model.decoder.block.13.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.13.layer.2.DenseReluDense.wo.weight model.decoder.block.13.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.13.layer.2.layer_norm.weight model.decoder.block.13.layer.2.layer_norm.weight\n",
      "model.decoder.block.14.layer.0.SelfAttention.q.weight model.decoder.block.14.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.14.layer.0.SelfAttention.k.weight model.decoder.block.14.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.14.layer.0.SelfAttention.v.weight model.decoder.block.14.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.14.layer.0.SelfAttention.o.weight model.decoder.block.14.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.14.layer.0.layer_norm.weight model.decoder.block.14.layer.0.layer_norm.weight\n",
      "model.decoder.block.14.layer.1.EncDecAttention.q.weight model.decoder.block.14.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.14.layer.1.EncDecAttention.k.weight model.decoder.block.14.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.14.layer.1.EncDecAttention.v.weight model.decoder.block.14.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.14.layer.1.EncDecAttention.o.weight model.decoder.block.14.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.14.layer.1.layer_norm.weight model.decoder.block.14.layer.1.layer_norm.weight\n",
      "model.decoder.block.14.layer.2.DenseReluDense.wi_0.weight model.decoder.block.14.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.14.layer.2.DenseReluDense.wi_1.weight model.decoder.block.14.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.14.layer.2.DenseReluDense.wo.weight model.decoder.block.14.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.14.layer.2.layer_norm.weight model.decoder.block.14.layer.2.layer_norm.weight\n",
      "model.decoder.block.15.layer.0.SelfAttention.q.weight model.decoder.block.15.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.15.layer.0.SelfAttention.k.weight model.decoder.block.15.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.15.layer.0.SelfAttention.v.weight model.decoder.block.15.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.15.layer.0.SelfAttention.o.weight model.decoder.block.15.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.15.layer.0.layer_norm.weight model.decoder.block.15.layer.0.layer_norm.weight\n",
      "model.decoder.block.15.layer.1.EncDecAttention.q.weight model.decoder.block.15.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.15.layer.1.EncDecAttention.k.weight model.decoder.block.15.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.15.layer.1.EncDecAttention.v.weight model.decoder.block.15.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.15.layer.1.EncDecAttention.o.weight model.decoder.block.15.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.15.layer.1.layer_norm.weight model.decoder.block.15.layer.1.layer_norm.weight\n",
      "model.decoder.block.15.layer.2.DenseReluDense.wi_0.weight model.decoder.block.15.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.15.layer.2.DenseReluDense.wi_1.weight model.decoder.block.15.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.15.layer.2.DenseReluDense.wo.weight model.decoder.block.15.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.15.layer.2.layer_norm.weight model.decoder.block.15.layer.2.layer_norm.weight\n",
      "model.decoder.block.16.layer.0.SelfAttention.q.weight model.decoder.block.16.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.16.layer.0.SelfAttention.k.weight model.decoder.block.16.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.16.layer.0.SelfAttention.v.weight model.decoder.block.16.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.16.layer.0.SelfAttention.o.weight model.decoder.block.16.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.16.layer.0.layer_norm.weight model.decoder.block.16.layer.0.layer_norm.weight\n",
      "model.decoder.block.16.layer.1.EncDecAttention.q.weight model.decoder.block.16.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.16.layer.1.EncDecAttention.k.weight model.decoder.block.16.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.16.layer.1.EncDecAttention.v.weight model.decoder.block.16.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.16.layer.1.EncDecAttention.o.weight model.decoder.block.16.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.16.layer.1.layer_norm.weight model.decoder.block.16.layer.1.layer_norm.weight\n",
      "model.decoder.block.16.layer.2.DenseReluDense.wi_0.weight model.decoder.block.16.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.16.layer.2.DenseReluDense.wi_1.weight model.decoder.block.16.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.16.layer.2.DenseReluDense.wo.weight model.decoder.block.16.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.16.layer.2.layer_norm.weight model.decoder.block.16.layer.2.layer_norm.weight\n",
      "model.decoder.block.17.layer.0.SelfAttention.q.weight model.decoder.block.17.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.17.layer.0.SelfAttention.k.weight model.decoder.block.17.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.17.layer.0.SelfAttention.v.weight model.decoder.block.17.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.17.layer.0.SelfAttention.o.weight model.decoder.block.17.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.17.layer.0.layer_norm.weight model.decoder.block.17.layer.0.layer_norm.weight\n",
      "model.decoder.block.17.layer.1.EncDecAttention.q.weight model.decoder.block.17.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.17.layer.1.EncDecAttention.k.weight model.decoder.block.17.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.17.layer.1.EncDecAttention.v.weight model.decoder.block.17.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.17.layer.1.EncDecAttention.o.weight model.decoder.block.17.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.17.layer.1.layer_norm.weight model.decoder.block.17.layer.1.layer_norm.weight\n",
      "model.decoder.block.17.layer.2.DenseReluDense.wi_0.weight model.decoder.block.17.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.17.layer.2.DenseReluDense.wi_1.weight model.decoder.block.17.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.17.layer.2.DenseReluDense.wo.weight model.decoder.block.17.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.17.layer.2.layer_norm.weight model.decoder.block.17.layer.2.layer_norm.weight\n",
      "model.decoder.block.18.layer.0.SelfAttention.q.weight model.decoder.block.18.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.18.layer.0.SelfAttention.k.weight model.decoder.block.18.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.18.layer.0.SelfAttention.v.weight model.decoder.block.18.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.18.layer.0.SelfAttention.o.weight model.decoder.block.18.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.18.layer.0.layer_norm.weight model.decoder.block.18.layer.0.layer_norm.weight\n",
      "model.decoder.block.18.layer.1.EncDecAttention.q.weight model.decoder.block.18.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.18.layer.1.EncDecAttention.k.weight model.decoder.block.18.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.18.layer.1.EncDecAttention.v.weight model.decoder.block.18.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.18.layer.1.EncDecAttention.o.weight model.decoder.block.18.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.18.layer.1.layer_norm.weight model.decoder.block.18.layer.1.layer_norm.weight\n",
      "model.decoder.block.18.layer.2.DenseReluDense.wi_0.weight model.decoder.block.18.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.18.layer.2.DenseReluDense.wi_1.weight model.decoder.block.18.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.18.layer.2.DenseReluDense.wo.weight model.decoder.block.18.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.18.layer.2.layer_norm.weight model.decoder.block.18.layer.2.layer_norm.weight\n",
      "model.decoder.block.19.layer.0.SelfAttention.q.weight model.decoder.block.19.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.19.layer.0.SelfAttention.k.weight model.decoder.block.19.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.19.layer.0.SelfAttention.v.weight model.decoder.block.19.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.19.layer.0.SelfAttention.o.weight model.decoder.block.19.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.19.layer.0.layer_norm.weight model.decoder.block.19.layer.0.layer_norm.weight\n",
      "model.decoder.block.19.layer.1.EncDecAttention.q.weight model.decoder.block.19.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.19.layer.1.EncDecAttention.k.weight model.decoder.block.19.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.19.layer.1.EncDecAttention.v.weight model.decoder.block.19.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.19.layer.1.EncDecAttention.o.weight model.decoder.block.19.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.19.layer.1.layer_norm.weight model.decoder.block.19.layer.1.layer_norm.weight\n",
      "model.decoder.block.19.layer.2.DenseReluDense.wi_0.weight model.decoder.block.19.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.19.layer.2.DenseReluDense.wi_1.weight model.decoder.block.19.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.19.layer.2.DenseReluDense.wo.weight model.decoder.block.19.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.19.layer.2.layer_norm.weight model.decoder.block.19.layer.2.layer_norm.weight\n",
      "model.decoder.block.20.layer.0.SelfAttention.q.weight model.decoder.block.20.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.20.layer.0.SelfAttention.k.weight model.decoder.block.20.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.20.layer.0.SelfAttention.v.weight model.decoder.block.20.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.20.layer.0.SelfAttention.o.weight model.decoder.block.20.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.20.layer.0.layer_norm.weight model.decoder.block.20.layer.0.layer_norm.weight\n",
      "model.decoder.block.20.layer.1.EncDecAttention.q.weight model.decoder.block.20.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.20.layer.1.EncDecAttention.k.weight model.decoder.block.20.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.20.layer.1.EncDecAttention.v.weight model.decoder.block.20.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.20.layer.1.EncDecAttention.o.weight model.decoder.block.20.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.20.layer.1.layer_norm.weight model.decoder.block.20.layer.1.layer_norm.weight\n",
      "model.decoder.block.20.layer.2.DenseReluDense.wi_0.weight model.decoder.block.20.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.20.layer.2.DenseReluDense.wi_1.weight model.decoder.block.20.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.20.layer.2.DenseReluDense.wo.weight model.decoder.block.20.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.20.layer.2.layer_norm.weight model.decoder.block.20.layer.2.layer_norm.weight\n",
      "model.decoder.block.21.layer.0.SelfAttention.q.weight model.decoder.block.21.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.21.layer.0.SelfAttention.k.weight model.decoder.block.21.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.21.layer.0.SelfAttention.v.weight model.decoder.block.21.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.21.layer.0.SelfAttention.o.weight model.decoder.block.21.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.21.layer.0.layer_norm.weight model.decoder.block.21.layer.0.layer_norm.weight\n",
      "model.decoder.block.21.layer.1.EncDecAttention.q.weight model.decoder.block.21.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.21.layer.1.EncDecAttention.k.weight model.decoder.block.21.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.21.layer.1.EncDecAttention.v.weight model.decoder.block.21.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.21.layer.1.EncDecAttention.o.weight model.decoder.block.21.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.21.layer.1.layer_norm.weight model.decoder.block.21.layer.1.layer_norm.weight\n",
      "model.decoder.block.21.layer.2.DenseReluDense.wi_0.weight model.decoder.block.21.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.21.layer.2.DenseReluDense.wi_1.weight model.decoder.block.21.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.21.layer.2.DenseReluDense.wo.weight model.decoder.block.21.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.21.layer.2.layer_norm.weight model.decoder.block.21.layer.2.layer_norm.weight\n",
      "model.decoder.block.22.layer.0.SelfAttention.q.weight model.decoder.block.22.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.22.layer.0.SelfAttention.k.weight model.decoder.block.22.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.22.layer.0.SelfAttention.v.weight model.decoder.block.22.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.22.layer.0.SelfAttention.o.weight model.decoder.block.22.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.22.layer.0.layer_norm.weight model.decoder.block.22.layer.0.layer_norm.weight\n",
      "model.decoder.block.22.layer.1.EncDecAttention.q.weight model.decoder.block.22.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.22.layer.1.EncDecAttention.k.weight model.decoder.block.22.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.22.layer.1.EncDecAttention.v.weight model.decoder.block.22.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.22.layer.1.EncDecAttention.o.weight model.decoder.block.22.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.22.layer.1.layer_norm.weight model.decoder.block.22.layer.1.layer_norm.weight\n",
      "model.decoder.block.22.layer.2.DenseReluDense.wi_0.weight model.decoder.block.22.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.22.layer.2.DenseReluDense.wi_1.weight model.decoder.block.22.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.22.layer.2.DenseReluDense.wo.weight model.decoder.block.22.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.22.layer.2.layer_norm.weight model.decoder.block.22.layer.2.layer_norm.weight\n",
      "model.decoder.block.23.layer.0.SelfAttention.q.weight model.decoder.block.23.layer.0.SelfAttention.q.weight\n",
      "model.decoder.block.23.layer.0.SelfAttention.k.weight model.decoder.block.23.layer.0.SelfAttention.k.weight\n",
      "model.decoder.block.23.layer.0.SelfAttention.v.weight model.decoder.block.23.layer.0.SelfAttention.v.weight\n",
      "model.decoder.block.23.layer.0.SelfAttention.o.weight model.decoder.block.23.layer.0.SelfAttention.o.weight\n",
      "model.decoder.block.23.layer.0.layer_norm.weight model.decoder.block.23.layer.0.layer_norm.weight\n",
      "model.decoder.block.23.layer.1.EncDecAttention.q.weight model.decoder.block.23.layer.1.EncDecAttention.q.weight\n",
      "model.decoder.block.23.layer.1.EncDecAttention.k.weight model.decoder.block.23.layer.1.EncDecAttention.k.weight\n",
      "model.decoder.block.23.layer.1.EncDecAttention.v.weight model.decoder.block.23.layer.1.EncDecAttention.v.weight\n",
      "model.decoder.block.23.layer.1.EncDecAttention.o.weight model.decoder.block.23.layer.1.EncDecAttention.o.weight\n",
      "model.decoder.block.23.layer.1.layer_norm.weight model.decoder.block.23.layer.1.layer_norm.weight\n",
      "model.decoder.block.23.layer.2.DenseReluDense.wi_0.weight model.decoder.block.23.layer.2.DenseReluDense.wi_0.weight\n",
      "model.decoder.block.23.layer.2.DenseReluDense.wi_1.weight model.decoder.block.23.layer.2.DenseReluDense.wi_1.weight\n",
      "model.decoder.block.23.layer.2.DenseReluDense.wo.weight model.decoder.block.23.layer.2.DenseReluDense.wo.weight\n",
      "model.decoder.block.23.layer.2.layer_norm.weight model.decoder.block.23.layer.2.layer_norm.weight\n",
      "model.decoder.final_layer_norm.weight model.decoder.final_layer_norm.weight\n",
      "model.lm_head.weight model.lm_head.weight\n"
     ]
    }
   ],
   "source": [
    "for k, v in old_weights:\n",
    "    new_k = k.replace('._orig_mod', '')\n",
    "    print(k, new_k)\n",
    "    weights[new_k] = v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<All keys matched successfully>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.load_state_dict(weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1d0d653134d04544b4c747fb8f33b06b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/nanot5-large-malaysian-cased/commit/7629865169da48e79f11a1390c1dd9fb455fc3de', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='7629865169da48e79f11a1390c1dd9fb455fc3de', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.model.push_to_hub('nanot5-large-malaysian-cased', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
